COVID 19 - some simple data visualizations for US data

Some other covid19 visualizations:

https://coronavirus.1point3acres.com/

https://coronavirus.jhu.edu/map.html

# data source https://www.census.gov/data/datasets/time-series/demo/popest/2010s-state-total.html and wikipedia
df_population <- data.frame(
  state = c("AK", "AL", "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", 
            "GA", "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", 
            "MD", "ME", "MI", "MN", "MO", "MP", "MS", "MT", "NC", "ND", "NE", 
            "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", 
            "SC", "SD", "TN", "TX", "UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"),
  population = c(731545, 4903185, 3017804, 55465 , 7278717, 39512223, 5758736, 3565287, 705749, 973764, 21477737,
                 10617423, 165768, 1415872, 3155070, 1787065, 12671821, 6732219, 2913314, 4467673, 4648794, 6892503, 
                 6045680, 1344212,  9986857, 5639632, 6137428, 56882, 2976149, 1068778, 10488084, 762062, 1934408,
                 1359711, 8882190, 2096829, 3080156, 19453561, 11689100, 3956971, 4217737, 12801989, 3193694, 1059361,
                 5148714, 884659, 6829174, 28995881, 3205958, 8535519, 106977 , 623989, 7614893, 5822434, 1792147, 578759)
)

# The Atlantic Monthly Group (CC BY-NC 4.0)
# source: https://covidtracking.com/api

df_states <- fread("https://covidtracking.com/api/v1/states/daily.csv") %>% 
               replace(is.na(.), 0) %>%
               inner_join(df_population, by = "state")%>%
               mutate(date = as.Date(as.character(date), "%Y%m%d"))

tableau10 <- as.list(ggthemes_data[["tableau"]][["color-palettes"]][["regular"]][[1]][,2])$value
first_day <- as.Date("2020-03-15") # to select a date
today <-  as.Date(toString(max(df_states$date)))
  
kable(head(df_states, n = 3))
date state positive probableCases negative pending totalTestResultsSource totalTestResults hospitalizedCurrently hospitalizedCumulative inIcuCurrently inIcuCumulative onVentilatorCurrently onVentilatorCumulative recovered lastUpdateEt dateModified checkTimeEt death hospitalized hospitalizedDischarged dateChecked totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral deathConfirmed deathProbable totalTestEncountersViral totalTestsPeopleViral totalTestsAntibody positiveTestsAntibody negativeTestsAntibody totalTestsPeopleAntibody positiveTestsPeopleAntibody negativeTestsPeopleAntibody totalTestsPeopleAntigen positiveTestsPeopleAntigen totalTestsAntigen positiveTestsAntigen fips positiveIncrease negativeIncrease total totalTestResultsIncrease posNeg dataQualityGrade deathIncrease hospitalizedIncrease hash commercialScore negativeRegularScore negativeScore positiveScore score grade population
2021-03-05 AK 56886 0 0 0 totalTestsViral 1731628 33 1293 0 0 2 0 0 3/5/2021 03:59 2021-03-05T03:59:00Z 03/04 22:59 305 1293 0 2021-03-05T03:59:00Z 1731628 68693 1660758 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 141 0 56886 7144 56886 0 2 3 7c69ab3363700e0bfc193bc3374d28a89a0d1102 0 0 0 0 0 0 731545
2021-03-05 AL 498887 107518 1924758 0 totalTestsPeopleViral 2316127 526 45976 0 2672 0 1514 295690 3/5/2021 11:00 2021-03-05T11:00:00Z 03/05 06:00 10122 45976 0 2021-03-05T11:00:00Z 0 0 0 391369 7943 2179 0 2316127 0 0 0 119005 0 0 0 0 0 0 1 811 4223 2423645 4920 2423645 0 28 169 5789acffb29d541bdafc68570b05d49d8e08dc97 0 0 0 0 0 0 4903185
2021-03-05 AR 324326 68961 2472738 0 totalTestsViral 2728103 359 14903 153 0 86 1530 314732 3/5/2021 00:00 2021-03-05T00:00:00Z 03/04 19:00 5283 14903 0 2021-03-05T00:00:00Z 2728103 0 2472738 255365 4291 992 0 0 0 0 0 0 0 0 479119 81647 0 0 5 570 8452 2797064 8874 2797064 0 10 27 a06fef34948008705e4a705542367a074df3e6f8 0 0 0 0 0 0 3017804

0.1 Rhode Island (as I live in RI now)

df_states %>% filter(state == "RI") %>%
    ggplot() + 
      geom_label(x = first_day, y = 2000, color = "darkgray", label = "total positive", size = 2, hjust = 0) + 
      geom_text(mapping = aes(x = date, y = 2100, label = positive), color = "darkgray", size = 2, angle = 90, hjust = 0)+ 
      #geom_label(x = first_day, y = 800, color = "black", label = "death", size = 2, hjust = 0) + 
      geom_label(x = first_day, y = 2000, color = tableau10[2], label = "positiveIncrease", size = 2, hjust = 0) + 
      geom_label(x = first_day, y = 1900, color = tableau10[1], label = "hospitalizedCurrently", size = 2, hjust = 0) + 
      # geom_line(mapping = aes(x = date, y = death), alpha = 0.7, color = "black", size = LINE_SIZE) + 
      # geom_text(mapping = aes(x = date - 0.5, y = death + 10, label = death), color = "black", size = 1.5) + 
      # geom_point(mapping = aes(x = date, y = death), color = "black", shape = 10) + 
      geom_line(mapping = aes(x = date, y = hospitalizedCurrently), alpha = 0.7, color = tableau10[1], size = LINE_SIZE) + 
      geom_text(mapping = aes(x = date - 0.5, y = hospitalizedCurrently + 20, label = hospitalizedCurrently), color =  tableau10[1], size = 1.25) + 
      geom_point(mapping = aes(x = date, y = hospitalizedCurrently), color = tableau10[1], shape = 15) + 
      geom_line(mapping = aes(x = date, y = positiveIncrease), alpha = 0.7, color = tableau10[2], size = LINE_SIZE) + 
      geom_text(mapping = aes(x = date - 0.5, y = positiveIncrease + 20, label = positiveIncrease), color =  tableau10[2], size = 1.25)+ 
      geom_point(mapping = aes(x = date, y = positiveIncrease), color = tableau10[2]) + 
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "week")) + 
      xlab("Date") + ylab("") + ggtitle("RI")

0.2 US - all states

df_states %>% group_by(date) %>%
    summarise(positiveIncrease = sum(positiveIncrease), hospitalizedCurrently = sum(hospitalizedCurrently), total = sum(positive)) %>% 
    ungroup() %>%
    ggplot() + 
     geom_label(x = first_day, y = 270000, color = "darkgray", label = "total positive: ", size = 2, hjust = 0) +
     geom_text(mapping = aes(x = date, y = 260000, label = total), color = "darkgray", size = 2, angle = 90, hjust = 0) +
     geom_label(x = first_day, y = 250000, color = tableau10[1], label = "hospitalizedCurrently", size = 2, hjust = 0) +
     geom_label(x = first_day, y = 240000, color = tableau10[2], label = "positiveIncrease", size = 2, hjust = 0) +
     geom_line(mapping = aes(x = date, y = hospitalizedCurrently), alpha = 0.7, color = tableau10[1], size = LINE_SIZE) +
     geom_text(mapping = aes(x = date - 0.5, y = hospitalizedCurrently + 5000, label = hospitalizedCurrently), color =  tableau10[1], size = 1.25) +
     geom_point(mapping = aes(x = date, y = hospitalizedCurrently), color = tableau10[1], shape = 15) +
     geom_line(mapping = aes(x = date, y = positiveIncrease), alpha = 0.7, color = tableau10[2], size = LINE_SIZE) +
     geom_text(mapping = aes(x = date - 0.5, y = positiveIncrease + 5000, label = positiveIncrease), color =  tableau10[2], size = 1.25) +
     geom_point(mapping = aes(x = date, y = positiveIncrease), color = tableau10[2]) +
     scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "week")) +
     xlab("Date") + ylab("") + ggtitle("US - positiveIncrease & hospitalizedCurrently")

0.3 US - daily top-2 contributors

If a state has been a top 2 contributor

as_top <- df_states %>%
    filter(date > first_day)%>%
    mutate(str_date = as.character(date))%>%
    group_by(str_date) %>%
    arrange(positiveIncrease, by_group = TRUE)%>%
    slice_tail(n = 2) %>%
    ungroup() %>%
    summarise(unique(state))
as_top <- unlist(as_top)
    

  
df_states %>%
    filter(state %in% as_top) %>%
    ggplot() +
      stat_steamgraph(mapping = aes(x = date, y = positiveIncrease, group = state, fill = state))  +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "week"))  +
      scale_y_continuous(breaks = seq(-60000, 60000, by = 10000), labels = c("60000","50000","400000", "30000", "20000", "10000", "0", "10000", "20000", "30000", "40000", "50000","60000")) +
      scale_fill_manual(values = TABEALU20) +
      xlab("Date") + ylab("positiveIncrease") + ggtitle("If a state was a top-2 contributor on a day")

0.4 US - positiveIncrease by state

num_lag <- 21

find_coef <- function(x, y){
  m <- lm(y ~ x)
  return(coef(m)[2])
}


df_colors <-  df_states %>%
  group_by(state)%>%
  arrange(date, .by_group = TRUE) %>%
  slice_tail(n = num_lag) %>% # last N days
  summarise(trend_coef = find_coef(date, positiveIncrease)) %>% 
  mutate(trend_color = ifelse(trend_coef > 0, "increasing", ifelse(trend_coef < 0, "decreasing", "stable"))) %>% 
  ungroup()%>%
  replace(is.na(.), 0) %>%
  select(state, trend_coef, trend_color) 
 
  
df_states %>% 
    inner_join(df_colors, by = "state") %>%
    ggplot() +
      geom_smooth(mapping = aes(x = date, y = positiveIncrease), color = "gray", alpha = 0.3, method = "loess", size = LINE_SIZE) +
      geom_line(mapping = aes(x = date, y = positiveIncrease, color = trend_color), alpha = 0.7, size = LINE_SIZE) +
      geom_point(mapping = aes(x = date, y = positiveIncrease, color = trend_color), size = 1) +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
      scale_colour_tableau() +
      facet_wrap(state ~ ., ncol = 6, scales = "free") +
      xlab("Date") + ylab("") + ggtitle("US - positiveIncrease by state, colored by the trend of last 21 days")

df_states %>% 
    inner_join(df_colors, by = "state") %>%
    mutate(positiveIncreasePerMillion = positiveIncrease / population * 1000000)%>%
    ggplot() +
      geom_smooth(mapping = aes(x = date, y = positiveIncreasePerMillion), color = "gray", alpha = 0.3, method = "loess", size = LINE_SIZE) +
      geom_line(mapping = aes(x = date, y = positiveIncreasePerMillion, color = trend_color), alpha = 0.7, size = LINE_SIZE) +
      geom_point(mapping = aes(x = date, y = positiveIncreasePerMillion, color = trend_color), size = 1) +
      scale_y_continuous(limits = c(0, 1500), breaks = seq(0, 1500, by = 500)) +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
      scale_colour_tableau() +
      facet_wrap(state ~ ., ncol = 6, scales = "free")  +
      xlab("Date") + ylab("") + ggtitle("US - positiveIncreasePerMillion by state, colored by the trend of last 21 days")

0.5 US - hospitalizedCurrently by state

df_states %>% 
    ggplot() +
      geom_smooth(mapping = aes(x = date, y = hospitalizedCurrently), color = "gray", alpha = 0.3, method = "loess", size = LINE_SIZE) +
      geom_line(mapping = aes(x = date, y = hospitalizedCurrently), alpha = 0.7, color = tableau10[3], size = LINE_SIZE) +
      geom_point(mapping = aes(x = date, y = hospitalizedCurrently), color = tableau10[3], size = 1) +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
      facet_wrap(state ~ ., ncol = 6, scales = "free") +
      xlab("Date") + ylab("") + ggtitle("US - hospitalizedCurrently by state")

0.6 US - dailyTestPositiveRate against overallTestedPopulationRate

df_pr <- df_states %>% 
    mutate(testPositiveRate = positiveIncrease / totalTestResultsIncrease, testedPopulationRate = totalTestResults / population) %>%
    filter(testPositiveRate > 0 & testPositiveRate < 1) # rm buggy data to allow log scales
  


df_pr_colors <-  df_pr %>%
  group_by(state)%>%
  arrange(date, .by_group = TRUE) %>%
  slice_tail(n = num_lag) %>% # last N days
  summarise(trend_coef = find_coef(date, testPositiveRate)) %>% 
  mutate(trend_color = ifelse(trend_coef > 0, "increasing", ifelse(trend_coef < 0, "decreasing", "stable"))) %>% 
  ungroup()%>%
  replace(is.na(.), 0) %>%
  select(state, trend_coef, trend_color) 


df_pr_summary <- df_states %>%
  filter(date >  as.Date('2020-07-31') & date < as.Date('2020-11-14'))%>%
  group_by(date) %>%
  summarise(national_positive = sum(positiveIncrease), national_tested = sum(totalTestResultsIncrease))%>%
  mutate(testPositiveRate = national_positive / national_tested)%>%
  ungroup() %>%
  summarise(testPositiveRate_mean = median(testPositiveRate), testPositiveRate_sd = mad(testPositiveRate), per95 = quantile(testPositiveRate, probs = 0.95))


df_pr %>%
 inner_join(df_pr_colors, by = "state") %>%
 ggplot() +
    geom_smooth(mapping = aes(x = testedPopulationRate, y = testPositiveRate), color = "gray", alpha = 0.3, method = "loess", size = LINE_SIZE) +
    geom_line(mapping = aes(x = testedPopulationRate, y = testPositiveRate, color = trend_color), alpha = 0.7, size = LINE_SIZE) +
    geom_point(mapping = aes(x = testedPopulationRate, y = testPositiveRate, color = trend_color), size = 1) +
    scale_x_continuous(limits = c(0, 2.5), breaks = seq(0, 2.5, by = 0.25)) +
    scale_y_continuous(limits = c(0.001, 1), trans = 'log10', breaks = c(0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1)) +
    scale_colour_tableau() +
    facet_wrap(state ~ ., ncol = 6, scales = "free")  +
    xlab("dailyTestPositiveRate") + ylab("overallTestedPopulationRate") + ggtitle("US - dailyTestPositiveRate against overallTestedPopulationRate")

0.7 US - death per 10k by state

df_states %>% 
    mutate(deathPer10K = death / population * 10000) %>%
    ggplot() +
     geom_line(mapping = aes(x = date, y = deathPer10K), alpha = 0.7, color = tableau10[3], size = LINE_SIZE) +
     geom_point(mapping = aes(x = date, y = deathPer10K), color = tableau10[3], size = 1) +
     scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
     scale_y_continuous(limits = c(0, 50), breaks = seq(0, 50, by = 10)) +
     facet_wrap(state ~ ., ncol = 6, scales = "free")  +
     xlab("date") + ylab("death per 10k") + ggtitle("US - death per 10k by state")

0.8 US - positive per 1k by state

df_states %>% 
    mutate(positivePerOneK = positive / population * 1000) %>%
    ggplot() +
      geom_line(mapping = aes(x = date, y = positivePerOneK), alpha = 0.7, color = tableau10[4], size = LINE_SIZE) +
      geom_point(mapping = aes(x = date, y = positivePerOneK), color = tableau10[4], size = 1) +
      scale_y_continuous(limits = c(0, 150), breaks = seq(0, 150, by = 30)) +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
      facet_wrap(state ~ ., ncol = 6, scales = "free") +
      xlab("date") + ylab("") + ggtitle("US - positivePerOneK by state")

0.9 US - tested amount by state

df_states %>% 
    mutate(testResultsIncrease = positiveIncrease + negativeIncrease) %>%
    ggplot() +
      geom_smooth(mapping = aes(x = date, y = testResultsIncrease), color = "gray", alpha = 0.3, method = "loess", size = LINE_SIZE) +
      geom_line(mapping = aes(x = date, y = testResultsIncrease), alpha = 0.7, color = tableau10[7], size = LINE_SIZE) +
      geom_point(mapping = aes(x = date, y = testResultsIncrease), color = tableau10[7], size = 1) +
      scale_x_date(limits = c(first_day, today), breaks = seq(first_day, today, by = "month")) +
      facet_wrap(state ~ ., ncol = 6, scales = "free")  +
      xlab("date") + ylab("testResultsIncrease") + ggtitle("US - testResultsIncrease by state")